############# HUMAN data #################
# Load thyroid data
GTEx_thyroid_female <- read.delim("~/BONOBO/data/GTEx_thyroid/GTEx_thyroid_female.txt")
GTEx_thyroid_male <- read.delim("~/BONOBO/data/GTEx_thyroid/GTEx_thyroid_male.txt")
# Load microarray data
GSE19783_miRNA_mRNA_expression <- read.csv("~/BONOBO/data/miRNA_mRNA_breastCancer/GSE19783_miRNA_mRNA_expression.txt", sep="")

# Check how many of the genes have Normal Distribution over all samples
pvals_thyroid_female = unlist(apply(GTEx_thyroid_female[,-1], MARGIN = 1, function(x){ks.test(as.numeric(x), rnorm(100, mean(x), sd(x)))$p.value}))
pvals_thyroid_male = unlist(apply(GTEx_thyroid_male[,-1], MARGIN = 1, function(x){ks.test(as.numeric(x), rnorm(100, mean(x), sd(x)))$p.value}))
pvals_breast = unlist(apply(GSE19783_miRNA_mRNA_expression, MARGIN = 1, function(x){ks.test(as.numeric(x), rnorm(100, mean(x), sd(x)))$p.value}))

# Plot boxplot of all pvalues of KS Test for normality
library(ggplot2)
pvals = c(pvals_thyroid_female, pvals_thyroid_male, pvals_breast)
fulldata = data.frame(pvals)
fulldata$data_label = c(rep("Thyroid_female", length(pvals_thyroid_female)), rep("Thyroid_male", length(pvals_thyroid_male)), rep("Breast_cancer", length(pvals_breast)))

# compute proportion of p-values less than 0.05
prop_thyroid_female = round(sum(pvals_thyroid_female < 0.05) * 100 / length(pvals_thyroid_female),2)
prop_thyroid_male = round(sum(pvals_thyroid_male < 0.05) * 100 / length(pvals_thyroid_male),2)
prop_breastCancer = round(sum(pvals_breast < 0.05) * 100 / length(pvals_breast),2)

c(prop_thyroid_female, prop_thyroid_male, prop_breastCancer)

# Boxplot: Normality Test
ggplot(fulldata, aes(x=data_label, y=pvals, fill = data_label)) + geom_boxplot() + 
  ggtitle("P-values of Kolmogorov-Smirnov Test for Normality") + 
  geom_hline(yintercept = 0.05, lty = "dashed" , color = "red") +
  geom_text(aes(1.3, 0.05, label = "pval = 0.05", vjust = -1), size = 3, col = "red")

# Find which genes are not normal
nn_GTEx_female = GTEx_thyroid_female[which(pvals_thyroid_female < 0.05),-1]
nn_GTEx_male = GTEx_thyroid_male[which(pvals_thyroid_male < 0.05),-1]
nn_breast = GSE19783_miRNA_mRNA_expression[which(pvals_breast < 0.05),]

# Check how many of the genes have Unimodal Distribution over all samples
library(diptest)
upvals_thyroid_female = unlist(apply(nn_GTEx_female, MARGIN = 1, function(x){dip.test(as.numeric(x))$p.value}))
upvals_thyroid_male = unlist(apply(nn_GTEx_male, MARGIN = 1, function(x){dip.test(as.numeric(x))$p.value}))
upvals_breast = unlist(apply(nn_breast, MARGIN = 1, function(x){dip.test(as.numeric(x))$p.value}))

# compute proportion of p-values less than 0.05
prop_thyroid_female = round(sum(upvals_thyroid_female < 0.05) * 100 / length(upvals_thyroid_female),2)
prop_thyroid_male = round(sum(upvals_thyroid_male < 0.05) * 100 / length(upvals_thyroid_male),2)
prop_breastCancer = round(sum(upvals_breast < 0.05) * 100 / length(upvals_breast),2)

100-c(prop_thyroid_female, prop_thyroid_male, prop_breastCancer)

# Plot boxplot of all pvalues of for unimodality
library(ggplot2)
pvals = c(upvals_thyroid_female, upvals_thyroid_male, upvals_breast)
fulldata = data.frame(pvals)
fulldata$data_label = c(rep("Thyroid_female", length(upvals_thyroid_female)), rep("Thyroid_male", length(upvals_thyroid_male)), rep("Breast_cancer", length(upvals_breast)))

# Boxplot: Unimodality Test
ggplot(fulldata, aes(x=data_label, y=pvals, fill = data_label)) + geom_boxplot() + 
  ggtitle("P-values of Hartigans' Dip Test for Unimodality") + 
  geom_hline(yintercept = 0.05, lty = "dashed" , color = "red") +
  geom_text(aes(1.3, 0.05, label = "pval = 0.05", vjust = -1), size = 3, col = "red")

# Check how many of the genes have Symmetric Distribution over all samples
# Find which genes are not normal but symmetric
nnu_GTEx_female = nn_GTEx_female[which(upvals_thyroid_female >= 0.05),]
nnu_GTEx_male = nn_GTEx_male[which(upvals_thyroid_male >= 0.05),]
nnu_breast = nn_breast[which(upvals_breast >= 0.05),]

# Symmetry is tested using the method of Cabilio-Masaro
library(lawstat)
# rand_sample = sample(1:nrow(GTEx_thyroid_female), 50)
spvals_thyroid_female = apply(nnu_GTEx_female[1:(nrow(nnu_GTEx_female)-10),], MARGIN = 1, function(x){print("working");symmetry.test(as.numeric(x), "CM")$p.value})
spvals_thyroid_female = unlist(spvals_thyroid_female)
spvals_thyroid_male = apply(nnu_GTEx_male[1:(nrow(nnu_GTEx_male)-10),], MARGIN = 1, function(x){print("working");symmetry.test(as.numeric(x), "CM")$p.value})
spvals_thyroid_male = unlist(spvals_thyroid_male)
# rand_sample = sample(1:nrow(GSE19783_miRNA_mRNA_expression), 50)
spvals_breast = apply(nnu_breast[1:(nrow(nnu_breast)-10),], MARGIN = 1, function(x){print("working");symmetry.test(as.numeric(x), "CM")$p.value})
spvals_breast = unlist(spvals_breast)

# compute proportion of p-values less than 0.05
prop_thyroid_female = round(sum(spvals_thyroid_female < 0.05) * 100 / length(spvals_thyroid_female),2)
prop_thyroid_male = round(sum(spvals_thyroid_male < 0.05) * 100 / length(spvals_thyroid_male),2)
prop_breastCancer = round(sum(spvals_breast < 0.05) * 100 / length(spvals_breast),2)

100-c(prop_thyroid_female, prop_thyroid_male, prop_breastCancer)

# Plot boxplot of all pvalues of Symmetry Test
library(ggplot2)
pvals = c(spvals_thyroid_female, spvals_thyroid_male, spvals_breast)
fulldata = data.frame(pvals)
fulldata$data_label = c(rep("Thyroid_female", length(spvals_thyroid_female)), rep("Thyroid_male", length(spvals_thyroid_male)), rep("Breast_cancer", length(spvals_breast)))

# Boxplot: Symmetry Test
ggplot(fulldata, aes(x=data_label, y=pvals, fill = data_label)) + geom_boxplot() + 
  ggtitle("P-values of Test for Symmetry") + 
  geom_hline(yintercept = 0.05, lty = "dashed" , color = "red") +
  geom_text(aes(1.3, 0.05, label = "pval = 0.05", vjust = -1), size = 3, col = "red")

########## YEAST single-cell psedobulk data ##############
# Only genes with zero everywhere are discarded
pseudobulk_nonzero_logcounts <- read.delim("/home/ubuntu/pseudobulk_nonzero_logcounts.tsv")
pseudobulk_perc02_logcounts <- read.delim("/home/ubuntu/pseudobulk_perc02_logcounts.tsv")
yeast_data = pseudobulk_perc02_logcounts
# Check how many of the genes have Normal Distribution over all samples
pvals_yeast = unlist(apply(yeast_data[,-1], MARGIN = 1, function(x){ks.test(as.numeric(x), rnorm(100, mean(x), sd(x)))$p.value}))
# Plot boxplot of all pvalues of KS Test for normality
library(ggplot2)
pvals = pvals_yeast
fulldata = data.frame(pvals)
fulldata$data_label = rep("yeast_pseudobulk", length(pvals))

# Boxplot: Normality Test
ggplot(fulldata, aes(x=data_label, y=pvals, fill = data_label)) + geom_boxplot() + 
  ggtitle("P-values of Kolmogorov-Smirnov Test for Normality (Yeast)") + 
  geom_hline(yintercept = 0.05, lty = "dashed" , color = "red") + xlab("") +
  geom_text(aes(0.5, 0.05, label = "pval = 0.05", vjust = -1), size = 3, col = "red")

# Percentage of non-significant results
100*sum(pvals_yeast >= 0.05)/length(pvals_yeast)
